In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn import svm  
from ipywidgets import interact
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
from sklearn.neural_network import MLPClassifier
import plotly.express as px
import seaborn as sn
from pandas.plotting import scatter_matrix
In [2]:
pd.set_option('display.max_rows', 10000)
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_colwidth',1000)

Data Preprocessing

In [3]:
df=pd.read_excel("international_matches.xlsx")
df.tail(5)
Out[3]:
date home_team away_team home_team_continent away_team_continent home_team_fifa_rank away_team_fifa_rank home_team_total_fifa_points away_team_total_fifa_points home_team_score away_team_score tournament city country neutral_location shoot_out home_team_result home_team_goalkeeper_score away_team_goalkeeper_score home_team_mean_defense_score home_team_mean_offense_score home_team_mean_midfield_score away_team_mean_defense_score away_team_mean_offense_score away_team_mean_midfield_score
19803 2022-06-14 Moldova Andorra Europe Europe 180 153 932 1040 2 1 UEFA Nations League Chișinău Moldova False No Win 65.0 NaN NaN NaN NaN NaN NaN NaN
19804 2022-06-14 Liechtenstein Latvia Europe Europe 192 135 895 1105 0 2 UEFA Nations League Vaduz Liechtenstein False No Lose NaN 65.0 NaN NaN NaN NaN NaN NaN
19805 2022-06-14 Chile Ghana South America Africa 28 60 1526 1387 0 0 Kirin Cup Suita Japan True Yes Lose 79.0 74.0 75.5 76.7 78.2 75.5 76.0 78.2
19806 2022-06-14 Japan Tunisia Asia Africa 23 35 1553 1499 0 3 Kirin Cup Suita Japan False No Lose 73.0 NaN 75.2 75.0 77.5 70.8 72.3 74.0
19807 2022-06-14 Korea Republic Egypt Asia Africa 29 32 1519 1500 4 1 Friendly Seoul Korea Republic False No Win 75.0 NaN 73.0 80.0 73.8 NaN 79.3 70.8
In [4]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19808 entries, 0 to 19807
Data columns (total 25 columns):
 #   Column                         Non-Null Count  Dtype         
---  ------                         --------------  -----         
 0   date                           19808 non-null  datetime64[ns]
 1   home_team                      19808 non-null  object        
 2   away_team                      19808 non-null  object        
 3   home_team_continent            19808 non-null  object        
 4   away_team_continent            19808 non-null  object        
 5   home_team_fifa_rank            19808 non-null  int64         
 6   away_team_fifa_rank            19808 non-null  int64         
 7   home_team_total_fifa_points    19808 non-null  int64         
 8   away_team_total_fifa_points    19808 non-null  int64         
 9   home_team_score                19808 non-null  int64         
 10  away_team_score                19808 non-null  int64         
 11  tournament                     19808 non-null  object        
 12  city                           19808 non-null  object        
 13  country                        19808 non-null  object        
 14  neutral_location               19808 non-null  bool          
 15  shoot_out                      19808 non-null  object        
 16  home_team_result               19808 non-null  object        
 17  home_team_goalkeeper_score     8379 non-null   float64       
 18  away_team_goalkeeper_score     8095 non-null   float64       
 19  home_team_mean_defense_score   7787 non-null   float64       
 20  home_team_mean_offense_score   8510 non-null   float64       
 21  home_team_mean_midfield_score  8162 non-null   float64       
 22  away_team_mean_defense_score   7564 non-null   float64       
 23  away_team_mean_offense_score   8312 non-null   float64       
 24  away_team_mean_midfield_score  7979 non-null   float64       
dtypes: bool(1), datetime64[ns](1), float64(8), int64(6), object(9)
memory usage: 3.6+ MB

Columns like home_team_continent, away_team_continent, neutral_location, country, city,tournament, shoot_out are not pertinent to our objective hence we're removing them.

In [5]:
list(df.columns)
Out[5]:
['date',
 'home_team',
 'away_team',
 'home_team_continent',
 'away_team_continent',
 'home_team_fifa_rank',
 'away_team_fifa_rank',
 'home_team_total_fifa_points',
 'away_team_total_fifa_points',
 'home_team_score',
 'away_team_score',
 'tournament',
 'city',
 'country',
 'neutral_location',
 'shoot_out',
 'home_team_result',
 'home_team_goalkeeper_score',
 'away_team_goalkeeper_score',
 'home_team_mean_defense_score',
 'home_team_mean_offense_score',
 'home_team_mean_midfield_score',
 'away_team_mean_defense_score',
 'away_team_mean_offense_score',
 'away_team_mean_midfield_score']
In [6]:
clean_data = df.drop(['home_team_continent', 'away_team_continent', 'neutral_location', 'country', 'city', 'tournament', 'shoot_out'], axis='columns')

We do not need match-data for teams that are not playing in FIFA 2022, so we'll be dropping the records for irrelevant matches. We won't be including matches that resulted in a draw.

In [7]:
playing_teams=[
    "Qatar", "Ecuador", "Senegal", "Netherlands",
    "England", "Iran", "USA", "Wales",
    "Argentina", "Saudi Arabia", "Mexico", "Poland",
    "France", "Australia", "Denmark", "Tunisia",
    "Spain", "Costa Rica", "Germany", "Japan",
    "Belgium", "Canada", "Morocco", "Croatia",
    "Brazil", "Serbia", "Switzerland", "Cameroon",
    "Portugal", "Ghana", "Uruguay", "Korea Republic"
]
clean_data = clean_data[(clean_data.home_team.isin(playing_teams) | clean_data.away_team.isin(playing_teams)) & (clean_data.home_team_result != "Draw")].reset_index(drop=True)
In [8]:
clean_data.columns
Out[8]:
Index(['date', 'home_team', 'away_team', 'home_team_fifa_rank',
       'away_team_fifa_rank', 'home_team_total_fifa_points',
       'away_team_total_fifa_points', 'home_team_score', 'away_team_score',
       'home_team_result', 'home_team_goalkeeper_score',
       'away_team_goalkeeper_score', 'home_team_mean_defense_score',
       'home_team_mean_offense_score', 'home_team_mean_midfield_score',
       'away_team_mean_defense_score', 'away_team_mean_offense_score',
       'away_team_mean_midfield_score'],
      dtype='object')

The column names home_team and away_team will be renamed to team_A and team_B since the home and awayprefixes are irrelevant with respect to the World Cup.

In [9]:
clean_data = clean_data.rename(columns={col: col.replace("home_team", "team_A").replace("away_team", "team_B") for col in clean_data.columns})

We'll add a new column result, which is set to 1 when team_A wins and set to 0 when team_B wins.

In [10]:
clean_data['result'] = clean_data.team_A_result.apply(lambda x: 1 if x == "Win" else 0)
clean_data.drop(['team_A_result'], axis='columns', inplace=True)
In [11]:
clean_data.columns
Out[11]:
Index(['date', 'team_A', 'team_B', 'team_A_fifa_rank', 'team_B_fifa_rank',
       'team_A_total_fifa_points', 'team_B_total_fifa_points', 'team_A_score',
       'team_B_score', 'team_A_goalkeeper_score', 'team_B_goalkeeper_score',
       'team_A_mean_defense_score', 'team_A_mean_offense_score',
       'team_A_mean_midfield_score', 'team_B_mean_defense_score',
       'team_B_mean_offense_score', 'team_B_mean_midfield_score', 'result'],
      dtype='object')

The following cell creates country wise statistics.

In [12]:
country_data_A = clean_data.drop([col for col in clean_data.columns if "team_B" in col], axis='columns')
country_data_A.rename(columns = {"team_A": "country"}, inplace=True)
country_data_A.rename(columns = {col : col.replace("team_A_", "") for col in country_data_A.columns}, inplace=True)
country_data_B = clean_data.drop([col for col in clean_data.columns if "team_A" in col], axis='columns')
country_data_B.rename(columns = {"team_B": "country"}, inplace=True)
country_data_B.rename(columns = {col : col.replace("team_B_", "") for col in country_data_B.columns}, inplace=True)
country_data_B['result'] = 1 - country_data_B['result']
In [13]:
country_data = pd.concat([country_data_A, country_data_B])
In [14]:
country_data.tail(10)
Out[14]:
date country fifa_rank total_fifa_points score goalkeeper_score mean_defense_score mean_offense_score mean_midfield_score result
5607 2022-06-13 Austria 34 1500 0 74.0 80.2 77.0 80.8 0
5608 2022-06-13 Croatia 16 1621 1 82.0 77.8 76.7 84.2 1
5609 2022-06-14 New Zealand 101 1206 0 68.0 70.2 70.7 69.2 0
5610 2022-06-14 Hungary 40 1466 4 85.0 75.5 72.7 73.0 1
5611 2022-06-14 Italy 6 1723 2 89.0 84.2 85.3 84.5 0
5612 2022-06-14 Wales 18 1588 2 74.0 75.0 73.0 78.5 0
5613 2022-06-14 Belgium 2 1827 1 89.0 80.8 85.7 85.5 1
5614 2022-06-14 Ghana 60 1387 0 74.0 75.5 76.0 78.2 1
5615 2022-06-14 Tunisia 35 1499 3 NaN 70.8 72.3 74.0 1
5616 2022-06-14 Egypt 32 1500 1 NaN NaN 79.3 70.8 0
In [ ]:
# def get_running_avgs(row):
#     prev_match_a = country_data[(country_data.date < row['date']) & (country_data.country == row['team_A'])].sort_values(by = 'date').head(5)
#     prev_match_b = country_data[(country_data.date < row['date']) & (country_data.country == row['team_B'])].sort_values(by = 'date').head(5)
#     if prev_match_a.shape[0] == 0:
#         row['avg_score_A'] = row['team_A_score']
#         row['avg_gk_score_A'] = row['team_A_goalkeeper_score']
#         row['avg_defense_A'] = row['team_A_mean_defense_score']
#         row['avg_mid_A'] = row['team_A_mean_midfield_score']
#         row['avg_offense_A'] = row['team_A_mean_offense_score']
#     else:
#         row['avg_score_A'] = prev_match_a['score'].mean()
#         row['avg_gk_score_A'] = prev_match_a['goalkeeper_score'].mean()
#         row['avg_defense_A'] = prev_match_a['mean_defense_score'].mean()
#         row['avg_mid_A'] = prev_match_a['mean_midfield_score'].mean()
#         row['avg_offense_A'] = prev_match_a['mean_offense_score'].mean()
#     if prev_match_b.shape[0] == 0:
#         row['avg_score_B'] = row['team_B_score']
#         row['avg_gk_score_B'] = row['team_B_goalkeeper_score']
#         row['avg_defense_B'] = row['team_B_mean_defense_score']
#         row['avg_mid_B'] = row['team_B_mean_midfield_score']
#         row['avg_offense_B'] = row['team_B_mean_offense_score']
#     else:
#         row['avg_score_B'] = prev_match_b['score'].mean()
#         row['avg_gk_score_B'] = prev_match_b['goalkeeper_score'].mean()
#         row['avg_defense_B'] = prev_match_b['mean_defense_score'].mean()
#         row['avg_mid_B'] = prev_match_b['mean_midfield_score'].mean()
#         row['avg_offense_B'] = prev_match_b['mean_offense_score'].mean()
#     return row

EDA

In [15]:
data = clean_data.copy(deep=True)
column_list = ['fifa_rank','total_fifa_points','score','goalkeeper_score', 'mean_defense_score', 'mean_offense_score', 'mean_midfield_score']
In [16]:
fig = px.scatter_matrix(country_data[column_list],labels={col:col.replace('_score', '') for col in column_list})
fig.update_layout({"title": "Scatter Plot Matrix Country Wise Data" ,'height': 800, 'width': 1000, 'font_size':8.6})
fig.update_layout({"xaxis"+str(i+1): dict(tickangle = -45) for i in range(7)})
fig.update_traces(diagonal_visible=False,marker={'size': 3})
fig.show()

We can see that the columns fifa_rank and total_fifa_points are negatively correlated, which is as expected since teams with a good rank are expected to have higher rank. goalkeeper_score is positively correlated with total_fifa_points, mean_denfense, mean_offense. The following cell shows the correlation matrix for the same data

In [17]:
sn.heatmap(country_data[column_list].corr(),annot=True)
plt.show()

We can see that mean_defense_score, mean_offense_score, goalkeeper_score, have high correlation among them, this can be a cause for multicollinearity, so we will only include mean_midfield_score, goalkeeper_score in the model.

In [18]:
ctry_data = country_data.copy(deep=True)
ctry_data['year'] = ctry_data['date'].dt.to_period('Y')
In [19]:
grp_data = ctry_data.groupby(['country', 'year'], as_index=False).agg({'score':'mean', 'result':['count','sum'], 'goalkeeper_score':'mean', 'mean_defense_score':'mean', 'mean_offense_score':'mean','mean_midfield_score':'mean'})
wc_semi = ctry_data[ctry_data.country.isin(["Morocco", "France", "Argentina", "Croatia"])]
wc_semi = wc_semi.groupby("country", as_index=False).mean()
fig = px.bar(wc_semi, 
             x="country",
             y="goalkeeper_score",
             color="country",
             text_auto=True,
             labels={
                     "country": "Country",
                     "goalkeeper_score": "Goalkeeper Score",
                 })
fig.update_layout({"title": "GoalKeeper Score vs Country (2022 Season) Top 4"})
fig.show()
In [20]:
fig = px.bar(wc_semi, 
             x="country",
             y="mean_midfield_score",
             color="country",
             text_auto=True,
             labels = {
                 "country": "Country",
                 "mean_midfield_score": "Mean Midfield Score"
             }, 
             title = "Mean Midfield Score vs Country (2022 Season) Top 4")

fig.show()

Looking at the difference between the goals score by winner and the loser, the following chart shows the goal difference for the top 4 teams

In [21]:
import warnings
warnings.filterwarnings("ignore")
wc_data = data[data.team_A.isin(["Morocco", "France", "Argentina", "Croatia"])]
wc_data['goal_diff'] = wc_data['team_A_score'] - wc_data['team_B_score']
wc_data.rename(columns = {'team_A' : 'country'}, inplace=True)
wc_data2 = data[data.team_B.isin(["Morocco", "France", "Argentina", "Croatia"])]
wc_data2['goal_diff'] = wc_data['team_B_score'] - wc_data['team_A_score']
wc_data2.rename(columns = {'team_B': 'country'}, inplace=True)
wc_data = pd.concat([wc_data, wc_data2])[['country','goal_diff','date']]
wc_data_grp = wc_data[wc_data.date >= "2015-01-01"].groupby('country', as_index=False).mean().sort_values(by='goal_diff', ascending=False)
fig = px.bar(wc_data_grp, x='country',y='goal_diff',color='country', title = "Goal Difference vs Country",
             labels= {
                 'country': 'Country',
                 'goal_diff': 'Goal Difference(Top 4)'
             })
fig.show()

fifa_rank is an important variable. Given two teams the difference between their fifa_ranks should be a good metric. Difference between fifa_points is also a good metric.

In [22]:
clean_data['rank_diff'] = clean_data['team_A_fifa_rank'] - clean_data['team_B_fifa_rank']
clean_data['point_diff'] = clean_data['team_A_total_fifa_points'] - clean_data['team_B_total_fifa_points']

The following cell calculates the win_rate for each rank_diff

In [23]:
wr_rd = clean_data.groupby('rank_diff', as_index=False).agg({'result': ['sum','count']})

wr_rd.columns = [col[0] + col[1] for col in wr_rd.columns]
wr_rd['win_rate'] = wr_rd['resultsum']/wr_rd['resultcount']
fig = px.scatter(
    x=wr_rd['rank_diff'], 
    y=wr_rd['win_rate'],
    title="Win Rate vs Rank Diff",
    labels={
        'x': 'Rank Difference',
        'y': 'Win Rate'
    },
    height=500,
    width=500
)
fig.update_layout()
fig.show()

We can see that when rank_diff > 0, meaning opposition is a better team, the bigger the difference the smaller the win_rate.

Logistic Classification

In [24]:
def plot_cm(matrix, title):
  cm = [[val/sum(row) for val in row] for row in matrix]  
  plt.figure(figsize=(5,5))
  sn.heatmap(cm, annot=True)
  plt.title(title)
  plt.ylabel('Actual label')
  plt.xlabel('Predicted label')
In [25]:
X = clean_data[[ 'team_A', 'team_B', 'rank_diff', 'point_diff', 'team_A_mean_midfield_score', 'team_B_mean_midfield_score','team_A_goalkeeper_score','team_B_goalkeeper_score']]
Y = clean_data['result']
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
categorical_features = ['team_A', 'team_B']
In [26]:
def logistic_classification_model(X_train, X_test, y_train, y_test):
    categorical_transformer = OneHotEncoder(handle_unknown="ignore")
    preprocessor = ColumnTransformer(transformers = [
        ("cat", categorical_transformer, categorical_features)
    ])
    clf = Pipeline(
        steps=[("preprocessor", preprocessor), ("classifier", LogisticRegression(class_weight='balanced'))]
    )
    clf.fit(X_train, y_train)
    pred = clf.predict(X_test)
    score = clf.score(X_test, y_test)
    cm  = confusion_matrix(y_test, pred)
    return clf,pred,score, cm
    
In [27]:
logistic_clf,predictions,logistic_acc,logistic_cm = logistic_classification_model(X_train, X_test, y_train, y_test)
In [28]:
pred = logistic_clf.predict(X_test)
In [29]:
logistic_acc
Out[29]:
0.75355871886121
In [30]:
logistic_cm
Out[30]:
array([[338, 101],
       [176, 509]], dtype=int64)
In [31]:
plot_cm(logistic_cm, "Logistic Model")

Random Forest

In [32]:
def random_forest_model(X_train, X_test, y_train, y_test):
    categorical_transformer = OneHotEncoder(handle_unknown="ignore")
    preprocessor = ColumnTransformer(transformers = [
        ("cat", categorical_transformer, categorical_features)
    ])
    rand_forest_clf = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", RandomForestClassifier(class_weight='balanced'))]
    )
    rand_forest_clf.fit(X_train, y_train)
    score = rand_forest_clf.score(X_test,y_test)
    pred = rand_forest_clf.predict(X_test)
    cm = confusion_matrix(y_test, pred)
    return rand_forest_clf, pred, score, cm
In [33]:
random_forest_model, preds, random_forest_score, random_forest_confusion_mat = random_forest_model(X_train, X_test, y_train, y_test)
In [34]:
random_forest_score
Out[34]:
0.7250889679715302
In [35]:
random_forest_confusion_mat
Out[35]:
array([[292, 147],
       [162, 523]], dtype=int64)
In [36]:
plot_cm(random_forest_confusion_mat, "Random Forest")

Neural Network

In [37]:
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow import keras
from tensorflow import feature_column
from tensorflow.keras.utils import to_categorical
In [38]:
checkpoint_filepath = './checkpoints'
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=True,
    monitor='val_accuracy',
    mode='max',
    save_best_only=True)
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_accuracy', 
    verbose=1,
    patience=20,
    mode='max',
    restore_best_weights=True)
In [55]:
data = clean_data[[ 'team_A', 'team_B', 'rank_diff', 'point_diff', 'team_A_mean_midfield_score', 'team_B_mean_midfield_score','team_A_goalkeeper_score','team_B_goalkeeper_score','result']].copy(deep=True)
le_a = LabelEncoder()
le_b = LabelEncoder()
le_a.fit(data['team_A'])
le_b.fit(data['team_B'])
data['team_B'] = le_b.transform(data['team_B'])
data['team_A'] = le_a.transform(data['team_A'])
data.dropna(inplace=True)
neg, pos = np.bincount(data['result'])
total = neg + pos
weight_for_0 = (1 / neg) * (total / 2.0)
weight_for_1 = (1 / pos) * (total / 2.0)

class_weight = {0: weight_for_0, 1: weight_for_1}
train_df, test_df = train_test_split(data, test_size=0.2)
train_df, val_df = train_test_split(data, test_size=0.2)
train_labels = np.array(train_df.pop("result"))
test_labels = np.array(test_df.pop("result"))
val_labels = np.array(val_df.pop("result"))
In [56]:
train_features = np.array(train_df)
val_features = np.array(val_df)
test_features = np.array(test_df)
In [81]:
model = tf.keras.Sequential([
    layers.Dense(512, input_shape=(train_features.shape[-1],), activation=tf.nn.relu),
    layers.Dense(512, activation=tf.nn.relu),
    tf.keras.layers.Dropout(0.01),
    layers.Dense(256, activation=tf.nn.relu),
    tf.keras.layers.Dropout(0.01),
    layers.Dense(256, activation=tf.nn.relu),
    tf.keras.layers.Dropout(0.01),
    layers.Dense(128, activation=tf.nn.relu),
    tf.keras.layers.Dropout(0.01),
    layers.Dense(1, activation=tf.nn.sigmoid)
    ])

model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=1e-5),
    loss = tf.keras.losses.binary_crossentropy,
    metrics='accuracy'
)
In [82]:
history = model.fit(
    train_features, 
    train_labels,
    epochs=50,
    batch_size=50,
    class_weight=class_weight,
    callbacks=[model_checkpoint_callback, early_stopping],
    validation_data=(val_features, val_labels)
)
Epoch 1/50
40/40 [==============================] - 1s 13ms/step - loss: 1.1462 - accuracy: 0.5399 - val_loss: 0.7074 - val_accuracy: 0.6469
Epoch 2/50
40/40 [==============================] - 0s 10ms/step - loss: 0.7077 - accuracy: 0.6391 - val_loss: 0.6579 - val_accuracy: 0.6490
Epoch 3/50
40/40 [==============================] - 0s 10ms/step - loss: 0.6797 - accuracy: 0.6554 - val_loss: 0.6605 - val_accuracy: 0.6490
Epoch 4/50
40/40 [==============================] - 0s 10ms/step - loss: 0.6592 - accuracy: 0.6662 - val_loss: 0.6396 - val_accuracy: 0.6429
Epoch 5/50
40/40 [==============================] - 0s 10ms/step - loss: 0.6459 - accuracy: 0.6825 - val_loss: 0.6511 - val_accuracy: 0.6490
Epoch 6/50
40/40 [==============================] - 0s 10ms/step - loss: 0.6439 - accuracy: 0.6733 - val_loss: 0.6489 - val_accuracy: 0.6469
Epoch 7/50
40/40 [==============================] - 0s 10ms/step - loss: 0.6351 - accuracy: 0.6723 - val_loss: 0.6382 - val_accuracy: 0.6449
Epoch 8/50
40/40 [==============================] - 0s 10ms/step - loss: 0.6211 - accuracy: 0.6825 - val_loss: 0.6398 - val_accuracy: 0.6449
Epoch 9/50
40/40 [==============================] - 0s 10ms/step - loss: 0.6298 - accuracy: 0.6708 - val_loss: 0.6314 - val_accuracy: 0.6449
Epoch 10/50
40/40 [==============================] - 0s 10ms/step - loss: 0.6181 - accuracy: 0.6933 - val_loss: 0.6333 - val_accuracy: 0.6531
Epoch 11/50
40/40 [==============================] - 0s 10ms/step - loss: 0.6005 - accuracy: 0.6876 - val_loss: 0.6273 - val_accuracy: 0.6531
Epoch 12/50
40/40 [==============================] - 0s 10ms/step - loss: 0.6341 - accuracy: 0.6764 - val_loss: 0.6423 - val_accuracy: 0.6469
Epoch 13/50
40/40 [==============================] - 0s 10ms/step - loss: 0.6241 - accuracy: 0.6794 - val_loss: 0.6577 - val_accuracy: 0.6245
Epoch 14/50
40/40 [==============================] - 0s 10ms/step - loss: 0.6245 - accuracy: 0.6708 - val_loss: 0.6366 - val_accuracy: 0.6469
Epoch 15/50
40/40 [==============================] - 0s 10ms/step - loss: 0.6165 - accuracy: 0.6800 - val_loss: 0.6421 - val_accuracy: 0.6408
Epoch 16/50
40/40 [==============================] - 0s 10ms/step - loss: 0.6120 - accuracy: 0.6774 - val_loss: 0.6419 - val_accuracy: 0.6469
Epoch 17/50
40/40 [==============================] - 0s 10ms/step - loss: 0.5937 - accuracy: 0.6907 - val_loss: 0.6220 - val_accuracy: 0.6510
Epoch 18/50
40/40 [==============================] - 0s 11ms/step - loss: 0.6174 - accuracy: 0.6907 - val_loss: 0.6258 - val_accuracy: 0.6531
Epoch 19/50
40/40 [==============================] - 0s 10ms/step - loss: 0.5998 - accuracy: 0.6922 - val_loss: 0.6384 - val_accuracy: 0.6449
Epoch 20/50
40/40 [==============================] - 0s 10ms/step - loss: 0.5986 - accuracy: 0.6876 - val_loss: 0.6376 - val_accuracy: 0.6367
Epoch 21/50
40/40 [==============================] - 0s 11ms/step - loss: 0.5901 - accuracy: 0.6933 - val_loss: 0.6352 - val_accuracy: 0.6469
Epoch 22/50
40/40 [==============================] - 0s 10ms/step - loss: 0.6027 - accuracy: 0.6897 - val_loss: 0.6284 - val_accuracy: 0.6510
Epoch 23/50
40/40 [==============================] - 0s 10ms/step - loss: 0.5894 - accuracy: 0.7030 - val_loss: 0.6328 - val_accuracy: 0.6510
Epoch 24/50
40/40 [==============================] - 0s 11ms/step - loss: 0.5900 - accuracy: 0.6927 - val_loss: 0.6202 - val_accuracy: 0.6551
Epoch 25/50
40/40 [==============================] - 0s 10ms/step - loss: 0.5943 - accuracy: 0.6927 - val_loss: 0.6483 - val_accuracy: 0.6163
Epoch 26/50
40/40 [==============================] - 0s 10ms/step - loss: 0.5755 - accuracy: 0.7055 - val_loss: 0.6379 - val_accuracy: 0.6531
Epoch 27/50
40/40 [==============================] - 0s 10ms/step - loss: 0.5720 - accuracy: 0.7025 - val_loss: 0.6234 - val_accuracy: 0.6531
Epoch 28/50
40/40 [==============================] - 0s 10ms/step - loss: 0.5960 - accuracy: 0.6938 - val_loss: 0.6296 - val_accuracy: 0.6673
Epoch 29/50
40/40 [==============================] - 0s 10ms/step - loss: 0.5840 - accuracy: 0.6984 - val_loss: 0.6552 - val_accuracy: 0.6327
Epoch 30/50
40/40 [==============================] - 0s 10ms/step - loss: 0.5928 - accuracy: 0.7009 - val_loss: 0.6415 - val_accuracy: 0.6469
Epoch 31/50
40/40 [==============================] - 0s 11ms/step - loss: 0.5845 - accuracy: 0.6887 - val_loss: 0.6220 - val_accuracy: 0.6653
Epoch 32/50
40/40 [==============================] - 0s 10ms/step - loss: 0.5784 - accuracy: 0.7055 - val_loss: 0.6350 - val_accuracy: 0.6531
Epoch 33/50
40/40 [==============================] - 0s 10ms/step - loss: 0.5835 - accuracy: 0.7111 - val_loss: 0.6349 - val_accuracy: 0.6592
Epoch 34/50
40/40 [==============================] - 0s 11ms/step - loss: 0.5694 - accuracy: 0.7096 - val_loss: 0.6345 - val_accuracy: 0.6551
Epoch 35/50
40/40 [==============================] - 0s 11ms/step - loss: 0.5782 - accuracy: 0.7014 - val_loss: 0.6248 - val_accuracy: 0.6551
Epoch 36/50
40/40 [==============================] - 0s 10ms/step - loss: 0.5681 - accuracy: 0.7111 - val_loss: 0.6256 - val_accuracy: 0.6531
Epoch 37/50
40/40 [==============================] - 0s 11ms/step - loss: 0.5735 - accuracy: 0.7101 - val_loss: 0.6224 - val_accuracy: 0.6673
Epoch 38/50
40/40 [==============================] - 0s 12ms/step - loss: 0.5794 - accuracy: 0.7009 - val_loss: 0.6192 - val_accuracy: 0.6694
Epoch 39/50
40/40 [==============================] - 0s 11ms/step - loss: 0.5823 - accuracy: 0.6968 - val_loss: 0.6370 - val_accuracy: 0.6531
Epoch 40/50
40/40 [==============================] - 0s 11ms/step - loss: 0.5650 - accuracy: 0.7086 - val_loss: 0.6247 - val_accuracy: 0.6673
Epoch 41/50
40/40 [==============================] - 0s 11ms/step - loss: 0.5830 - accuracy: 0.6963 - val_loss: 0.6434 - val_accuracy: 0.6653
Epoch 42/50
40/40 [==============================] - 0s 11ms/step - loss: 0.5846 - accuracy: 0.6984 - val_loss: 0.6253 - val_accuracy: 0.6592
Epoch 43/50
40/40 [==============================] - 0s 10ms/step - loss: 0.5730 - accuracy: 0.7101 - val_loss: 0.6449 - val_accuracy: 0.6551
Epoch 44/50
40/40 [==============================] - 0s 10ms/step - loss: 0.5713 - accuracy: 0.7147 - val_loss: 0.6265 - val_accuracy: 0.6735
Epoch 45/50
40/40 [==============================] - 0s 11ms/step - loss: 0.5740 - accuracy: 0.7137 - val_loss: 0.6350 - val_accuracy: 0.6735
Epoch 46/50
40/40 [==============================] - 0s 11ms/step - loss: 0.5622 - accuracy: 0.7198 - val_loss: 0.6371 - val_accuracy: 0.6490
Epoch 47/50
40/40 [==============================] - 0s 11ms/step - loss: 0.5701 - accuracy: 0.6989 - val_loss: 0.6315 - val_accuracy: 0.6673
Epoch 48/50
40/40 [==============================] - 0s 10ms/step - loss: 0.5582 - accuracy: 0.7096 - val_loss: 0.6360 - val_accuracy: 0.6531
Epoch 49/50
40/40 [==============================] - 0s 10ms/step - loss: 0.5782 - accuracy: 0.7219 - val_loss: 0.6449 - val_accuracy: 0.6551
Epoch 50/50
40/40 [==============================] - 0s 11ms/step - loss: 0.5556 - accuracy: 0.7255 - val_loss: 0.6273 - val_accuracy: 0.6673
In [83]:
plt.plot(history.epoch, history.history['accuracy'])
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.title("Accuracy Curve")
plt.show()
In [84]:
plt.plot(history.epoch, history.history['loss'])
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.title("Loss Curve")
plt.show()
In [85]:
test_predictions_baseline = model.predict(test_features, batch_size=10)
49/49 [==============================] - 0s 1ms/step
In [86]:
baseline_results = model.evaluate(test_features, test_labels,
                                  batch_size=10, verbose=0)
In [87]:
model.evaluate(test_features, test_labels,
                                  batch_size=10)
49/49 [==============================] - 0s 1ms/step - loss: 0.5290 - accuracy: 0.7184
Out[87]:
[0.5290290713310242, 0.718367338180542]
In [88]:
def plot_cm_tf(labels, predictions):
  cm = confusion_matrix(labels, predictions)
  cm = [[val/sum(row) for val in row] for row in cm]
  plt.figure(figsize=(5,5))
  sn.heatmap(cm, annot=True)
  plt.title('Neural Network Confusion matrix')
  plt.ylabel('Actual label')
  plt.xlabel('Predicted label')
In [89]:
test_preds_roff = np.round(test_predictions_baseline)
In [90]:
plot_cm_tf(test_labels, test_preds_roff)

Prediction

In [77]:
def predict_outcome(team1, team2):
    team1_data = country_data[(country_data.country == team1) & (country_data.date > "2019-06-01")]
    team2_data = country_data[(country_data.country == team2) & (country_data.date > "2019-06-01")]
    rank_team1 = team1_data.fifa_rank.median()
    rank_team2 = team2_data.fifa_rank.median()
    gk_score1 = team1_data.goalkeeper_score.mean()
    gk_score2 = team2_data.goalkeeper_score.mean()
    midfield_score1 = team1_data.mean_midfield_score.mean()
    midfield_score2 = team2_data.mean_midfield_score.mean()
    win_rate1 = team1_data.result.sum()/team1_data.shape[1]
    win_rate2 =team2_data.result.sum()/team2_data.shape[1]
    if rank_team1 == rank_team2:
        rank_team2 = team2_data.fifa_rank.unique()[1]
    fifa_point1 = team1_data.total_fifa_points.mean()
    fifa_point2 = team2_data.total_fifa_points.mean()
    fig1 = px.bar(
        x=[team1, team2], 
        y=[fifa_point1, fifa_point2],
        title="Avg FIFA Points (2022 Season)",
        labels= {"x": "Country", "y": "FIFA Points"},
        color=[team1, team2],
        )
    fig1.show()
    
    fig2 = px.bar(
        x=[team1, team2], 
        y=[win_rate1, win_rate2],
        title="Win Rate (2022 Season)",
        labels= {"x": "Country", "y": "Win Rate"},
        color=[team1, team2],
        )
    fig2.show()
    fig3 = px.bar(
        x=[team1, team2], 
        y=[gk_score1, gk_score2],
        title="Avg Goalkeeper Score (2022 Season)",
        labels= {"x": "Country", "y": "Avg Goalkeeper Score"},
        color=[team1, team2],
        )
    fig3.show()
    fig4 = px.bar(
        x=[team1, team2], 
        y=[gk_score1, gk_score2],
        title="Avg Midfield Score (2022 Season)",
        labels= {"x": "Country", "y": "Avg Midfield Score"},
        color=[team1, team2],
        )
    fig4.show()
    df = pd.DataFrame([{
        "team_A": team1, 
        "team_B": team2,
        "rank_diff": rank_team1 - rank_team2,
        "point_diff": fifa_point1 - fifa_point2,
        "team_A_mean_midfield_score": midfield_score1,
        "team_B_mean_midfield_score": midfield_score2,
        "team_A_goalkeeper_score": gk_score1,
        "team_B_goalkeeper_score": gk_score2,
        }])
    #Logistic
    display(df)
    logistic_prediction = team1 if logistic_clf.predict(df)[0] == 1 else team2
    random_forest_prediction = team1 if random_forest_model.predict(df)[0] == 1 else team2
    df['team_B'] = le_b.transform(df['team_B'])
    df['team_A'] = le_a.transform(df['team_A'])
    mlp_prediction = model.predict(df, verbose=0)[0][0]
    mlp_prediction = team1 if mlp_prediction > 0.5 else team2
    predictions = pd.DataFrame(
        [
            {
                "Model": "Logistic",
                "Predicted Winner": logistic_prediction
            },
            {
                "Model": "Random Forest",
                "Predicted Winner": random_forest_prediction
            },
            {
                "Model": "Multilayer Perceptron (Neural Network)",
                "Predicted Winner": mlp_prediction
            },
        ]
    )
    return predictions
    
In [78]:
country_list = country_data.country.unique()
country_list.sort()
team1_dropdown = widgets.Dropdown(
    options=country_list,
    value="France"
)
team2_dropdown = widgets.Dropdown(
    options=country_list,
    value="Argentina"
)
In [79]:
op = interact(predict_outcome, team1=team1_dropdown, team2=team2_dropdown)
In [80]:
predict_outcome("France", "Argentina")
team_A team_B rank_diff point_diff team_A_mean_midfield_score team_B_mean_midfield_score team_A_goalkeeper_score team_B_goalkeeper_score
0 France Argentina -6.0 91.654762 87.36 83.539286 87.266667 82.392857
Out[80]:
Model Predicted Winner
0 Logistic Argentina
1 Random Forest Argentina
2 Multilayer Perceptron (Neural Network) France
In [ ]: